/*** 
This do file gets Q1 employment decline explained by wage growth
***/

*-------------------------------------------------------------------------------
* Set up
*-------------------------------------------------------------------------------

* Set $root 
project figstabs, root
if (r(buildrunning)==0) include "${root}/code/config_interactive.do"

* Set globals
project, uses("${root}/code/set_globals.do")
include "${root}/code/set_globals.do"
local category "Employment"

* Create required subfolders
cap mkdir "${root}/data/derived"
cap mkdir "${root}/data/derived/Employment"
cap mkdir "${root}/data/derived/CPS"
cap mkdir "${root}/results"
cap mkdir "${root}/results/Employment"
cap mkdir "${root}/results/paper numbers"
cap mkdir "${root}/results/paper numbers/`category'"

*-------------------------------------------------------------------------------
**# 1. Define and clean relevant subsample of CPS 
*-------------------------------------------------------------------------------

* We are interested in the subset of the CPS that resembles our Paychex sample (impose industry restrictions)
project, uses("${root}/data/dvc/CPS/cps_00037.dta")
use "${root}/data/dvc/CPS/cps_00037.dta", clear

* Sample restrictions
keep if age >= 16
			
* Create NAICS
gen naics = . 
replace naics = 11 if inrange(ind, 0170, 0290)
replace naics = 21 if inrange(ind, 0370, 0490)
replace naics = 23 if inrange(ind, 0770, 0770)
replace naics = 31 if inrange(ind, 1070, 1790)
replace naics = 32 if inrange(ind, 1870, 2590)
replace naics = 33 if inrange(ind, 2670, 3990)
replace naics = 42 if inrange(ind, 4070, 4590)
replace naics = 44 if inrange(ind, 4670, 5190)
replace naics = 45 if inrange(ind, 5275, 5790)
replace naics = 48 if inrange(ind, 6070, 6290)
replace naics = 49 if inrange(ind, 6370, 6390)
replace naics = 22 if inrange(ind, 0570, 0690)
replace naics = 51 if inrange(ind, 6470, 6780)
replace naics = 52 if inrange(ind, 6870, 6992)
replace naics = 53 if inrange(ind, 7070, 7190)
replace naics = 54 if inrange(ind, 7270, 7490)
replace naics = 55 if inrange(ind, 7570, 7570)
replace naics = 56 if inrange(ind, 7580, 7790)
replace naics = 61 if inrange(ind, 7860, 7890)
replace naics = 62 if inrange(ind, 7970, 8470)
replace naics = 71 if inrange(ind, 8560, 8590)
replace naics = 72 if inrange(ind, 8660, 8690)
replace naics = 81 if inrange(ind, 8770, 9290)
replace naics = 92 if inrange(ind, 9370, 9890)
						
* Be consistent with PIE and CES series
gen naics_code = ""
replace naics_code = "11" if naics == 11
replace naics_code = "21" if naics == 21
replace naics_code = "22" if naics == 22
replace naics_code = "23" if naics == 23
replace naics_code = "3133" if naics == 31 | naics == 32 | naics == 33
replace naics_code = "42" if naics == 42
replace naics_code = "4445" if naics == 44 | naics == 45
replace naics_code = "4849" if naics == 48 | naics == 49
replace naics_code = "51" if naics == 51
replace naics_code = "52" if naics == 52
replace naics_code = "53" if naics == 53
replace naics_code = "54" if naics == 54
replace naics_code = "55" if naics == 55
replace naics_code = "56" if naics == 56
replace naics_code = "61" if naics == 61
replace naics_code = "62" if naics == 62
replace naics_code = "71" if naics == 71
replace naics_code = "72" if naics == 72
replace naics_code = "81" if naics == 81
				
* Drop some sectors according to BLS adjustment 
drop if naics == 92 															// drop those working in public sector to match CES (Total Private Employment)
drop if naics == 11 															// drop those working in agriculture, forestry, fishing, and hunting according to BLS adjustment of CPS to CES
drop if naics == 9290 															// drop workers in private households such as nannies, housekeepers, etc.
			
* Drop some classes of workers according to BLS adjustment
drop if inlist(classwkr, 0, 13, 25, 26, 27, 28, 29) 							// drop missing (0), unincorporated, self-employed (13), and all public sector employees (25-29) 
			
* Keep those with jobs 
keep if empstat == 10
			
* Convert to super sector
gen naics_ss = ""
replace naics_ss = "10" if inlist(naics_code, "11", "21")
replace naics_ss = "20" if inlist(naics_code, "23")
replace naics_ss = "30" if inlist(naics_code, "31-33")
replace naics_ss = "40" if inlist(naics_code, "42", "44-45", "48-49", "22")
replace naics_ss = "50" if inlist(naics_code, "51")
replace naics_ss = "55" if inlist(naics_code, "52", "53")
replace naics_ss = "60" if inlist(naics_code, "54", "55", "56")
replace naics_ss = "65" if inlist(naics_code, "61", "62")
replace naics_ss = "70" if inlist(naics_code, "71", "72")
replace naics_ss = "80" if inlist(naics_code, "81")
			
* Reformat NAICS codes  
replace naics_code = subinstr(naics_code, "-", "_", .) 		

* Define hourly wages
cap drop wage
replace earnweek = . if earnweek > 9999 
replace uhrswork1 = . if uhrswork1 > 996
replace hourwage = . if hourwage > 999
replace hourwage = earnweek / uhrswork1 if mi(hourwage) & paidhour == 2  		// if paid hourly, divide weekly earnings by amount of hours usually worked
gen wage = hourwage if paidhour == 2
replace wage = earnweek / uhrswork1 if paidhour == 1

replace wage = 100 if wage > 100 & !mi(wage)
replace wage = 5 if wage < 5

* Add random noise to integer wages to avoid issues associated with wage bunching
set seed 1280
gen random_noise = runiform(-0.5, 0.5)
replace wage = wage + random_noise if wage == round(wage, 1) 

* Date of survey
gen date = ym(year, month)
format %tm date 

tempfile cps_cleaned 
save `cps_cleaned'

*-------------------------------------------------------------------------------
**# 2. Calculate wage growth over time
*-------------------------------------------------------------------------------

* Note if you want to change this local - code further down (part 2.2) requires the base date to be <= Jul 2020
local base_date = ym(2020, 7)                    
assert `base_date' <= ym(2020, 7)                               

*-------------------------------------------------------------------------------
**## 2.1. Reweigh samples for respondent composition differences
*-------------------------------------------------------------------------------

* Since we are using the repeated cross-section aspect of the CPS rather than the panel aspect, we need to account for 
* differences in respondent composition. We do so by reweighting samples to resemble the sample in the base period. 

*-------------------------------------------------------------------------------
* Clean demographic controls 
*-------------------------------------------------------------------------------

* Education
recode educ (2/71 = 1 "No high school diploma") (73=2 "High school diploma") (81 91 92=3 "Some tertiary") (111=4 "Bachelor's") (123/125=5 "Master's or PhD"), g(educ2)

* Race 
gen race2 = 1 if race == 100 
replace race2 = 2 if race == 200
replace race2 = 3 if race == 651 
replace race2 = 4 if mi(race2) & !mi(race)
 
* Age
assert !mi(age)
gen agegrp6 = .
replace agegrp6 = 1 if age < 25 
replace agegrp6 = 2 if inrange(age, 25, 34) 
replace agegrp6 = 3 if inrange(age, 35, 44) 
replace agegrp6 = 4 if inrange(age, 45, 54) 
replace agegrp6 = 5 if inrange(age, 55, 64) 
replace agegrp6 = 6 if age >=65

* Occupation
gen occ_3_digit = round(occ2010/10, 10)

* Hispanic 
gen hispanic = hispan != 0 if !mi(hispan)

* US-born citizen 
gen us_born_citizen = inlist(citizen, 1, 2) if !mi(citizen)

*-------------------------------------------------------------------------------
* Construct inverse probability weights to make composition of sample at date XX similar to sample in base period
*-------------------------------------------------------------------------------

gen newwt_`base_date' = earnwt                                                  // weight at base period is just CPS weights

forval date = `=`base_date' + 1'/`=ym(2022, 4)' {
		
		cap drop pre 
		cap drop post 
	
		gen pre = 1 if date == `base_date' 									
		replace pre = 0 if date == `date'
	
		* controls: 1990 industry (more granular than NAICS), occupation, race, gender, age group, education level, geographic region, hispanic, US citizen 
		probit pre i.ind1990 i.occ_3_digit i.race2 i.sex i.agegrp6 i.educ2 i.region i.hispanic i.us_born_citizen if wage > 0 & !mi(wage) [pw = earnwt], difficult

		cap drop phat
		predict phat
			 
		* IPW, ATT: weight = 1 for post, p/(1-p) for pre *
		gen newwt_`date' =  earnwt * phat/(1-phat) if pre == 0      		    
}		

forval date = `=`base_date' + 1'/`=ym(2022, 4)' {
				
		local post_period = `date'
		local pre_period = `date' - 1
		
		* Wage ventiles, estimated separately at each date
		cap drop wage_pct_post wage_pct_pre
		gquantiles wage_pct_post = wage if date == `post_period' [pw = newwt_`post_period'], xtile n(20)
		gquantiles wage_pct_pre = wage if date == `pre_period' [pw = newwt_`pre_period'], xtile n(20)
		
		preserve 
		gcollapse (mean) wage [pw = newwt_`post_period'], by(wage_pct_post)
		forvalues ventile = 1/20 {
			qui su wage if wage_pct_post == `ventile'
			local after_wage_`date'_`ventile' = `r(mean)'
		}
		restore 
			
		preserve 
		gcollapse (mean) wage [pw = newwt_`pre_period'], by(wage_pct_pre)
		forvalues ventile = 1/20 {
			qui su wage if wage_pct_pre == `ventile'
			local before_wage_`date'_`ventile' = `r(mean)'
		}
		restore 
}

*-------------------------------------------------------------------------------
**## 2.2. Make date-by-ventile dataset with smoothed wage growth
*-------------------------------------------------------------------------------

clear 

* Number of ventiles
set obs 20
	
* Create identifying variables (date and ventile) 
gen date = `base_date'
format date %tm

gen ventile = . 
forvalues i = 1/20 {
	replace ventile  = `i' if _n == `i'
}

* Expand to get more dates 
expand `=ym(2022, 4)' - `base_date' + 1, gen(dup)
bys ventile dup: replace date = `base_date' + _n if dup == 1
gisid date ventile

* Gen wage_growth var 
gen wage_before = . 
gen wage_after =  . 
	
* Get wage ventiles
forval date = `=`base_date' + 1'/`=ym(2022, 4)' {
	forvalues ventile = 1/20 {
		replace wage_before =  `before_wage_`date'_`ventile'' if date == `date' & ventile == `ventile' 
		replace wage_after  =  `after_wage_`date'_`ventile'' if date == `date' & ventile == `ventile' 
	}
}
	
*-------------------------------------------------------------------------------
* Get smoothed month-to-month wage growth
*-------------------------------------------------------------------------------

cap drop *sm*
gen wage_growth_sm = 0 
 
forval date = `=`base_date' + 1'/`=ym(2022, 4)' {
	qui lowess wage_after ventile if date == `date', gen(afterwage_sm_`date') nograph 
	qui lowess wage_before ventile if date == `date', gen(beforewage_sm_`date') nograph 
	replace wage_growth_sm = ln(afterwage_sm_`date') - ln(beforewage_sm_`date') if date == `date'
}

assert wage_growth_sm == 0 if date == `base_date'

* Save data 
save "${root}/data/derived/Employment/wage_growth_cps_national.dta", replace
project, creates("${root}/data/derived/Employment/wage_growth_cps_national.dta")
	
*-------------------------------------------------------------------------------
* Chain month-to-month wage growth to get cumulative wage growth since base period
*-------------------------------------------------------------------------------

gisid ventile date 	
sort ventile date
tsset ventile date, monthly

gen product_sm = wage_growth_sm + 1

* Sample selection problem - we observe substantial wage growth during the worst months of the pandemic, especially in low ventiles, likely because low-wage earners are becoming unemployed
* To correct for this, we do two things. 

* First, we match the sample composition in July 2020 instead of January 2020 (implemented above) - the idea being that sample selection may be persistent from the pre-pandemic period to the post-pandemic period,
* but does not exacerbate within the post-pandemic period (post-pandemic meaning post-worst part of pandemic)

* Second, implemented here, we impute wage growth between Jan 2020 and Jul 2020 using wage growth in 2019 (see Feb 2020 BLS report on real earnings): https://www.bls.gov/news.release/archives/realer_02132020.pdf
* We use nominal wage growth because we adjust for inflation in our income quartile definitions, so we want to match nominal wages to nominal thresholds.
* Relevant estimates - nominal mean hourly wage of $28.37 in Dec 2019 and $27.58 in Jan 2019. This gives us annualized growth rate of about 3%. 
* The imputed cumulative growth rate between Jan 2020 and Jul 2020 should then be 3 * 7/12. 

replace product_sm = 1 if inrange(date, `base_date', ym(2020, 7))
replace product_sm = 1 + (28.37 - 27.58) / 27.58 * 7/12 if date == ym(2020, 7)
bysort ventile: replace product_sm = product_sm * L.product_sm if !inrange(date, `base_date', ym(2020, 7))

* Convert date to daily format
keep date ventile product_sm wage_growth_sm
replace date = mdy(month(dofm(date)), 15, year(dofm(date)))
format %td date
	
* Save 
save "${root}/data/derived/CPS/cps_wage_cutoffs_national.dta", replace
project, creates("${root}/data/derived/CPS/cps_wage_cutoffs_national.dta")

*-------------------------------------------------------------------------------
**# 3. Get employment in each quartile explained by wage growth
*-------------------------------------------------------------------------------

* "Explained by wage growth" means - holding total employment and respondent composition fixed, how do workers move between quartiles?
* In particular - how many workers who were Q1 in the base period move to Q2 and above by the end date, assuming they all stay employed (and there are no new entrants into employment)

use `cps_cleaned', clear				
keep if date == `base_date' 
keep if wage > 0 & !mi(wage)
	
* Get base period wage ventiles
gquantiles ventile = wage [pw = earnwt], xtile n(20)
	
* Stack copies of the base period data - basically, we want to keep the sample in the base period fixed
preserve 

forval date = `=`base_date' + 1'/`=ym(2022, 4)' {
	replace date = `date'

	tempfile date_`date'
	save `date_`date''
}
restore
	
forval date = `=`base_date' + 1'/`=ym(2022, 4)' {
	append using `date_`date''
}
	
* Convert date to daily format
replace date = mdy(month(dofm(date)), 15, year(dofm(date)))
format %td date
	
* Merge in wage growth rates by ventile
project, uses("${root}/data/derived/CPS/cps_wage_cutoffs_national.dta") derived
merge m:1 ventile date using "${root}/data/derived/CPS/cps_wage_cutoffs_national.dta", assert(3) nogen

* Change wages using growth in ventiles 
replace wage = wage * product_sm

* Define income quartiles using federal poverty thresholds
project, uses("${root}/data/dvc/Employment/poverty_thresholds.dta")
merge m:1 date using "${root}/data/dvc/Employment/poverty_thresholds.dta", keep(3)
	
gen quartile = . 
replace quartile = 1 if wage <= poverty_100 & !mi(wage)
replace quartile = 2 if wage > poverty_100 & wage <= poverty_150 & !mi(wage)
replace quartile = 3 if wage > poverty_150 & wage <= poverty_250 & !mi(wage)
replace quartile = 4 if wage > poverty_250 & !mi(wage)	
assert !mi(quartile)
	
* Total employment at each date and quartile
gen count = 1 
gcollapse (sum) employment_cps = count [pw = earnwt], by(date quartile)

* Employment change relative to base period, in percentage points 
gen temp = employment_cps if date == mdy(month(dofm(`base_date')), 15, year(dofm(`base_date')))
gegen base = mean(temp), by(quartile)
assert base == temp if !mi(temp)
gen emp_wage_growth = 100 * (employment_cps / base - 1)
	
* Save 
gen month = month(date)
gen year = year(date)
keep date quartile emp_wage_growth year month employment_cps

save "${root}/data/derived/Employment/employment_from_wage_growth_national.dta", replace 
project, creates("${root}/data/derived/Employment/employment_from_wage_growth_national.dta")
	
*-------------------------------------------------------------------------------
**# 4. Get ratio of Q1 employment explained by wage growth and graph Tracker Q1
*-------------------------------------------------------------------------------

* Get Tracker data 
project, uses("${root}/data/web/data/Employment - National - Weekly.csv")
import delimited "${root}/data/web/data/Employment - National - Weekly.csv", clear
gen date = mdy(month, day_endofweek, year)
format %td date
	
* Keep only necessary variables and express employment changes in percentage points
keep date emp_incq*
	
forvalues q = 1/4 {
	replace emp_incq`q' = emp_incq`q' * 100
}
	
* Reshape 
greshape long emp_incq, i(date) j(quartile)
	
* Get month and year 
gen month = month(date)
gen year = year(date)

* Drop missing 
drop if mi(emp_incq)
	
* Merge employment just from wage growth
project, uses("${root}/data/derived/Employment/employment_from_wage_growth_national.dta") derived
merge m:1 quartile month year using "${root}/data/derived/Employment/employment_from_wage_growth_national.dta", keepusing(emp_wage_growth) keep(1 3) nogen 

sort quartile date
su emp_wage_growth if quartile == 1 & date == ${finaldate}
local adjust_mean = `r(mean)'
su emp_incq if quartile == 1 & date == ${finaldate}
local pp = `r(mean)'

* Proportion of Q1 decline explained by wage growth
local prop_expl = string(100 * `adjust_mean' / `pp', "%3.0f")
local prop_not_expl = 100 - `prop_expl'
di in red "% Decline Explained by Wage Growth: `prop_expl'%"

* Proportion of Q1 decline unexplained by wage growth
local pp_expl : di %3.1f -1*(`adjust_mean')
local pp_not_expl : di %3.1f -1*(`pp' - `adjust_mean')
di `pp_expl'
di `pp_not_expl'

local pos_1 = - `pp_expl'/2 - 1.5
local pos_2 = -`pp_expl' - `pp_not_expl'/2 - 1.5

* Make figure 
tw	(line emp_incq date if quartile == 1 & year(date) >= 2020 & date <= ${finaldate}, sort color(oi1)) ///
	(pcarrowi -0.3 `=${finaldate}+10' `=-`pp_not_expl'+0.2' `=${finaldate}+10', color(gs8)) (pcarrowi `=-`pp_not_expl'+0.2' `=${finaldate}+10' -0.3 `=${finaldate}+10', color(gs8)) ///
	(pcarrowi `pp' `=${finaldate}+10' `=-`pp_not_expl'-0.2' `=${finaldate}+10', color(oi2)) (pcarrowi `=-`pp_not_expl'-0.2' `=${finaldate}+10' `pp' `=${finaldate}+10', color(oi2)) ///
	, ///
	ytitle("Change in Employment (%)" "Relative to January 2020") xtitle("") ///
	yline(0, lcolor(gs8)) graphregion(m(r+20)) ///
	xline(`=${finaldate}', lpattern(dash) lcolor(gs8)) ///
	text(3 `=${finaldate}' "Dec 31", color(black) size(2.5)) ///
	ylabel(0 "0%" -10 "-10%" -20 "-20%" -30 "-30%" -40 "-40%" ,nogrid) ///
	xlabel( `=mdy(1,15,2020)' `""Jan" "2020""' `=mdy(3,15,2020)' "Mar" `=mdy(5,15,2020)' "May" `=mdy(7,15,2020)' "July" `=mdy(9,15,2020)' "Sep" `=mdy(11,15,2020)' "Nov" ///
	`=mdy(1,15,2021)' `""Jan" "2021""' `=mdy(3,15,2021)' "Mar" `=mdy(5,15,2021)' "May" `=mdy(7,15,2021)' "July" `=mdy(9,15,2021)' "Sep" `=mdy(11,15,2021)' "Nov", labsize(small))	///
	legend(off)  ///
	text(`=`pos_1'-1' `=${finaldate} + 20' "`prop_not_expl'% (`pp_not_expl' p.p.)" "of decline" "unexplained", color(gs8) size(small) placement(east) justification(left)) ///
	text(`=`pos_2'-1' `=${finaldate} + 20' "`prop_expl'% (`pp_expl' p.p.)" "of decline" "explained" "by wage growth", color(oi2) size(small) placement(east) justification(left))
	
oi_graph_export "${root}/results/Employment/Percent of Q1 Decline Explained by Wage Growth", type("${fig_type}")

*-------------------------------------------------------------------------------
* Export output numbers to csv file
*-------------------------------------------------------------------------------

* 1 d.p. if first decimal place is nonzero; integer rounded if first decimal place is zero 
local pp_not_expl_str: di %3.1f `pp_not_expl'
if strpos("`pp_not_expl_str'", "0") == strlen("`pp_not_expl_str'") local pp_not_expl_fmt "%2.0f"
else local pp_not_expl_fmt "%3.1f"

cap erase "${root}/results/paper numbers/`category'/Percent of Q1 Decline Explained by Wage Growth.yaml"

yamlout using "${root}/results/paper numbers/`category'/Percent of Q1 Decline Explained by Wage Growth.yaml", ///
	key("expl_pp") ///
	comment("Decline explained by wage growth (p.p.)") ///
	value(`pp_expl') fmt(%9.1f)
yamlout using "${root}/results/paper numbers/`category'/Percent of Q1 Decline Explained by Wage Growth.yaml", ///
	key("expl_percent") ///
	comment("Decline explained by wage growth (%)") ///
	value(`prop_expl') fmt(%9.1f)
yamlout using "${root}/results/paper numbers/`category'/Percent of Q1 Decline Explained by Wage Growth.yaml", ///
	key("notexpl_pp") ///
	comment("Decline not explained by wage growth (p.p.)") ///
	value(`pp_not_expl') fmt(`pp_not_expl_fmt')
yamlout using "${root}/results/paper numbers/`category'/Percent of Q1 Decline Explained by Wage Growth.yaml", ///
	key("notexpl_percent") ///
	comment("Decline not explained by wage growth (%)") ///
	value(`prop_not_expl') fmt(%9.1f)
	
project, creates("${root}/results/paper numbers/`category'/Percent of Q1 Decline Explained by Wage Growth.yaml")
